NBIS ID: SMS-6112
Report Version: 1.0
Request by: Mattias Westman (mattias.westman@ki.se)
Principal Investigator: Carl-Johan Sundberg (carl.j.sundberg@ki.se)
Organisation: KI
NBIS Staff: Juliana Assis (juliana.assis@nbis.se)
source('https://raw.githubusercontent.com/nimarafati/RNA_Seq_Pipeline/master/RNA_Seq/DE_script.R')
## Paths
path <- '/Users/juliana/Documents/GitHub/SMS-6112-22-MB/'
## LIBRARIES
# data handling
library(dplyr)
library(tidyr)
library(stringr)
# tables
library(kableExtra) # complete table
library(formattable) # table with conditional formatting
library(DT) # interactive table
# graphics
library(ggplot2) # static graphics
# interactive graphics
library(highcharter)
library(plotly)
library(ggiraph) # convert ggplot to interactive
library(dygraphs) # time series
library(networkD3) # network graph
library(leaflet) # interactive maps
library(crosstalk) # linking plots
# analysis
library("devtools")
library("bsselectR")
library("dplyr")
library("ggplot2")
library("tidyr")
library("DECIPHER")
library("gplots")
library("gridExtra")
library("grid")
library("ggpubr")
library("reshape2")
library("reshape")
library("ggrepel")
library("ggh4x")
library("RColorBrewer")
library("tidyverse")
library("DESeq2") # Bioconductor
library('ggfortify')
library("AnnotationDbi")
library("FactoMineR")
library("factoextra")
library("PCAtools")
library("Rqc")
library("gt")
library("clusterProfiler")
library("rnaseqGene")
#library("refGenome")
library("plotly")
library("biomaRt")
library("ggridges")
library("viridis")
library("hrbrthemes")
## Functions
#Perform DE analysis
perform_DE <- function(dds, contrast_name, log2FC_cutoff, padjust_cutoff, file_name, volcano, genes, file_path){
res_obj <- results(dds, contrast = contrast_name, lfcThreshold = log2FC_cutoff, alpha = padjust_cutoff)
res <- res_obj
res$gene_id <- rownames(res_obj)
res <- inner_join(x = as.data.frame(res), y = genes, by = 'gene_id')
res <- res[!is.na(res$padj),]
res$diffexpressed <- 'No'
res$diffexpressed[(res$log2FoldChange >= log2FC_cutoff & res$padj <= 0.05)] <- 'Up-regulated'
res$diffexpressed[(res$log2FoldChange <= -log2FC_cutoff & res$padj <= 0.05)] <- 'Down-regulated'
res$delabel <- 'NA'
res[(res$diffexpressed != 'No'), 'delabel'] <- res[(res$diffexpressed != 'No'),'gene_name']
# Volcano
if(volcano == T){
png(paste0(file_path, file_name, '_volcano.png'), width = 3000, height = 2000, res = 300)
p <- ggplot(data= as.data.frame(res), aes(x = log2FoldChange, y = -log10(padj), col = diffexpressed, label = delabel)) +
geom_point() +
theme_classic() +
scale_colour_manual(values = c('blue', 'black', 'red')) +
geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff), col = 'red') +
geom_hline(yintercept = -log10(padjust_cutoff), col = 'red') +
ggtitle(file_name) +
theme(plot.title = element_text(hjust = 0.5))
print(p)
dev.off()
# ntd <- normTransform(dds)
# select <- res[(res$diffexpressed == 'Down-regulated'), 'gene_id']
# df <- as.data.frame(colData(dds)[,c('Cell_group', 'Abeta', 'Sex')])
# pheatmap(assay(ntd)[select,],
# cluster_rows = F,
# cluster_cols = T,
# show_rownames = F,
# annotation_col = df)
}
write.table(res, paste0(file_path, 'DE_results.txt'), col.names = T, row.names = F, sep = '\t', quote = F)
res_results <- list(DE_table = res,
Volcanot_plot = p,
res_obj = res_obj)
saveRDS(res_results, paste0(file_path, 'res_results.rds'))
return(res_results)
}
# Plot PC3, PC4
plotPCA_PCs <- function (object, ...)
{
.local <- function (object, intgroup = "condition", ntop = 500,
returnData = FALSE)
{
rv <- rowVars(assay(object))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(assay(object)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
if (!all(intgroup %in% names(colData(object)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}
intgroup.df <- as.data.frame(colData(object)[, intgroup,
drop = FALSE])
group <- if (length(intgroup) > 1) {
factor(apply(intgroup.df, 1, paste, collapse = ":"))
}
else {
colData(object)[[intgroup]]
}
d <- data.frame(PC1 = pca$x[, PC1], PC2 = pca$x[, PC2], group = group,
intgroup.df, name = colnames(object))
if (returnData) {
attr(d, "percentVar") <- percentVar[1:length(percentVar)]
return(d)
}
geom_point(size = 3) + xlab(paste0(PC1, ": ", round(percentVar[PC1] *
100), "% variance")) + ylab(paste0(PC2, ": ", round(percentVar[PC2] *
100), "% variance")) + coord_fixed()
}
.local(object, ...)
}
ridgeline_plot <- function(ego_result, plot_name, fc_sorted){
t <- ego_result
t <- t %>% mutate(geneID = strsplit(as.character(geneID), '/')) %>% unnest(geneID) %>% filter(geneID != '') %>% dplyr::select(geneID, c('ONTOLOGY', 'Description', 'geneID'))
t <- data.frame(t, logFC = fc_sorted[t$geneID])
for(ont in unique(t$ONTOLOGY)){
png(paste0(plot_name, '_', ont, '.png'), res = 200, height = 2000, width = 2000)
p <- ggplot(t[(t$ONTOLOGY == ont),], aes(x = logFC, y = Description, fill = ..x..)) +
geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
scale_fill_viridis(name = "logFC", option = "C") +
labs(title = paste0('logFC distribution of genes in enriched GO terms (', ont, ')')) +
theme_ipsum() +
theme(
legend.position="none",
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 8)
)
print(p)
dev.off()
}
}
## Enrichment analysis
### GO
run_ego <- function(DEG_entrez_id, OrgDb, ont, all_entrez_id, min_gene_Size, max_gene_Size, path_to_write) {
setwd(path_to_write)
ego <- clusterProfiler::enrichGO(DEG$entrezgene_id,
OrgDb = org_db,
ont = ontology_group,
universe = all_entrez_id,
minGSSize = min_gene_Size,
maxGSSize = max_gene_Size)
if(nrow(ego) > 0 && !is.null(ego)){
write.table(ego , 'enrichGO_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
p_ego<- clusterProfiler::dotplot(ego, split="ONTOLOGY", font.size = 8, showCategory = nrow(ego), title = 'GO enrichment analysis') + facet_grid(ONTOLOGY~., scale="free") + theme(plot.title = element_text(hjust = 0.5))
## ridgeline plot
ridgeline_plot(ego_result = ego@result, plot_name = 'ridgeline_FDR_0.05', fc_sorted = fc_sorted)
}else{
write.table('No GO enrichment result' , 'enrichGO_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
p_ego <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
}
print(p_ego)
dev.off()
return(ego)
}
#### GSEGO
run_gsego <- function (fc_sorted, OrgDb, ontology_group, keyType, min_gene_Size, max_gene_Size, pvalueCutoff, path_to_write){
setwd(path_to_write)
gsego <- clusterProfiler::gseGO(geneList=fc_sorted,
ont = ontology_group,
keyType = keyType,
minGSSize = min_gene_Size,
maxGSSize = max_gene_Size,
pvalueCutoff = pvalueCutoff,
pAdjustMethod = 'BH',
verbose = TRUE,
OrgDb = org_db)
if(!is.null(nrow(gsego)) && nrow(gsego) > 0 ){
write.table(gsego , 'GSEA_GO_FDR_0.05.txt', col.names = T, row.names = F, quote = F, sep = '\t')
## dotplot
png('dotplot_GSEA_GO_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsego_dotplot <- clusterProfiler::dotplot(gsego, showCategory=nrow(gsego), split=".sign", font.size = 8, title = 'Gene set enrichment analysis of GO terms' ) + facet_grid(.~.sign) + theme(plot.title = element_text(hjust = 0.5))
print(p_gsego_dotplot)
dev.off()
## Gene set enrichment plot for each significant term
cntr <- 1
for(go_id in as.character(gsego@result$ID)){
cat('\r', paste0('visualising ',go_id,' GO'))
Description <- gsub( x = gsub(pattern = '/', '_', perl = T,gsego$Description[cntr]), replacement = "_", pattern = ' ')
png(paste0('GSEA_GO_FDR_0.05_', Description, '.png'), res = 200, height = 2000, width = 2000)
p_gsego_gse <- clusterProfiler::gseaplot(gsego, by = ontology_group, title = gsego$Description[cntr], geneSetID = cntr)
print(p_gsego_gse)
dev.off()
cntr <- cntr + 1
}
}else{
write.table('No GSEA GO result', 'GSEA_GO_FDR_0.05.txt', col.names = F, row.names = F, quote = F, sep = '\t')
png('dotplot_GSEA_GO_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsego_dotplot <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p_gsego_dotplot)
dev.off()
}
return(gsego)
}
### KEGG
run_ekegg <- function (fc_sorted, organism, keyType, all_entrez_id, minGSSize, pvalueCutoff, path_to_write){
setwd(path_to_write)
ekegg <- clusterProfiler::enrichKEGG(names(fc_sorted), organism = organism, keyType = keyType, pvalueCutoff = pvalueCutoff, pAdjustMethod = "BH", universe = as.character(all_entrez_id), maxGSSize = 1000, minGSSize = 10)
ekegg@result$ratio <- sapply(strsplit(ekegg@result$GeneRatio, "/"), function(x) as.numeric(x[1])/as.numeric(x[2]))
if(!is.null(nrow(ekegg)) && nrow(ekegg) > 0 ){
write.table(ekegg , 'enrichKEGG_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
ekegg_FDR_0.05 <- ekegg
ekegg_FDR_0.05@result <- ekegg@result[(ekegg@result$p.adjust <= 0.05), ]
png('dotplot_KEGG_Enrichment_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_ekegg <- clusterProfiler::dotplot(ekegg_FDR_0.05, font.size = 8, showCategory = nrow(ekegg_FDR_0.05), title = 'KEGG enrichment analysis') + theme(plot.title = element_text(hjust = 0.5))
print(p_ekegg)
dev.off()
}else{
write.table('No KEGG enrichment result' , 'enrichKEGG_results.txt', col.names = F, row.names = F, quote = F, sep = '\t')
png('dotplot_KEGG_Enrichment_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsego_dotplot <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p_gsego_dotplot)
dev.off()
p_ekegg <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p_ekegg)
dev.off()
return(ekegg)
}
}
#### GSEA_KEGG
run_gsekegg <- function(fc_sorted, organism, keyType, eps = 0, minGSSize, pvalueCutoff, path_to_write){
setwd(path_to_write)
gsekegg <- clusterProfiler::gseKEGG(fc_sorted, organism = org, keyType = gene_type_conversion, pvalueCutoff = pvalueCutoff, eps = eps, minGSSize = min_gene_Size, maxGSSize = max_gene_Size, pAdjustMethod = "BH", nPermSimple = 10000)
if( !is.null(nrow(gsekegg)) && nrow(gsekegg) > 0){
gsekegg_FDR_0.05 <- gsekegg
gsekegg_FDR_0.05@result <- gsekegg@result[(gsekegg@result$p.adjust <= 0.05), ]
write.table(gsekegg_FDR_0.05 , 'GSEA_KEGG_FDR_0.05.txt', col.names = T, row.names = F, quote = F, sep = '\t')
png('dotplot_GSEA_KEGG_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsekegg_dotplot <- clusterProfiler::dotplot(gsekegg_FDR_0.05, showCategory = 10, title = "Enriched Pathways" , split=".sign") + facet_grid(.~.sign) + theme(plot.title = element_text(hjust = 0.5))
print(p_gsekegg_dotplot)
dev.off()
}else{
write.table('NO GSEA KEGG result' , 'GSEA_KEGG_FDR_0.05.txt', col.names = F, row.names = F, quote = F, sep = '\t')
png('dotplot_GSEA_KEGG_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsekegg_dotplot <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p_gsekegg_dotplot)
dev.off()
}
}
#### pathview
run_pathview <- function(gsekegg_FDR_0.05, fc_sorted, org, path_to_write){
setwd(path_to_write)
if(!is.null(nrow(gsekegg_FDR_0.05)) && nrow(gsekegg_FDR_0.05) >0 ){
cntr <- 1
for(ptw in as.character(gsekegg_FDR_0.05@result$ID)){
cat('\r', paste0('Saving ',ptw,' pathway'))
Description <- gsub( x = gsub(pattern = '/', '_', perl = T,gsekegg_FDR_0.05$Description[cntr]), replacement = "_", pattern = ' ')
pathview(gene.data=fc_sorted, pathway.id=ptw, species = org, same.layer = 2)
png(paste0('GSEA_KEGG_FDR_0.05_', Description, '.png'), res = 200, height = 2000, width = 2000)
p <- gseaplot(gsekegg_FDR_0.05, by = "all", title = gsekegg_FDR_0.05$Description[cntr], geneSetID = cntr)
print(p)
dev.off()
cntr <- cntr + 1
}
}else{
png(paste0('GSEA_KEGG_FDR_0.05.png'), res = 200, height = 2000, width = 2000)
p <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p)
dev.off()
}
}Myotis brandtii are known to have at least a ten-fold longer life expectancy compared to mice of similar body mass, and the highest relative longevity of any mammals. To this end we have collected fresh snap-frozen complete organ systems from these bats during both winter hibernation and summer active period. We thus want to compare gene expressions and control of gene expressions across organs and between these groups. The whole genome is sequenced and available, but the pipeline from sequences to count tables is not established.
The list of contrasts of interest is:
genes based on the row-wise variance (no statistical support)
-Variant between Summer vs Winter (Bat1 vs Bat2)
-Variant between Organs and S/W
> –Intestine-S & Intestine-W,
–Kidney-S & Kidney-W, –Skeletal_muscle-S & Skeletal_muscle-W
There are 12 samples where 5 belongs to Library1, 5 to Library2 and 2 to Library3
Type of data
bulk RNASeq
Data location
cd /proj/snic2022-22-672/private/SMS_6112_RNASeq_Bat_Library1_2_3/
SNIC 2022/23-355 SNIC Small Storage
SNIC 2022/22-672 SNIC Small Compute
NCBI Myotis brandtii Annotation Release 101
Partial genome with low quality annotation
nfcore-rnaseq
Trinity not shown
Reads were aligned to to the Myotis brandtii Annotation Release 101 genome by using nf-core/rnaseq pipeline (3.9) in Reproducibility section you can find all the tools with version used in nf-core pipeline. For downstream analyses we used expression values generated by.
In following table you can find list of samples and metadata.
library("GenomicFeatures")
library("ensembldb")
gtf2 <- rtracklayer::import("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/data/GCF_000412655.1_ASM41265v1_genomic.gtf")
TxDb <- makeTxDbFromGFF("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/data/GCF_000412655.1_ASM41265v1_genomic.gtf")
# Convert to gene names
mart <- useMart("ENSEMBL_MART_ENSEMBL")
mart <- useDataset(mart=mart,dataset="mlucifugus_gene_ensembl")
myattributes <- c("ensembl_gene_id",
"entrezgene_id",
"external_gene_name",
"chromosome_name",
"start_position",
"end_position",
"strand",
"gene_biotype",
"description")
bdata <- getBM(mart=mart,attributes=myattributes,uniqueRows=T,
useCache=FALSE)
# remove duplicated gene_ids
annotation <- bdata[!duplicated(bdata$ensembl_gene_id),c('ensembl_gene_id', 'external_gene_name', 'entrezgene_id')]
colnames(annotation) <- c('gene_id', 'Name', 'entrezgene_id')
annotation[(annotation$Name == ''), 'Name'] <- 'Uncharacterised'
write.table(annotation, paste0('/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/annotation.txt'), col.names = T, row.names = F, sep = '\t')
#ensDbFromAHHere we load normalized counts scaled for length generated by salmon (version) [@] in nf-core/rnaseq pipeline. We tested different model and dependent on the contrast we chose one of them for downstream analysis.
exp <- readRDS("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/ALL_Libraries/R_Files/salmon.merged.gene_counts_length_scaled.rds")
exp2 <- exp
assay(exp, 1) <- round(data.matrix(assay(exp, 1)))
assay(exp, 2) <- NULL
# Add metadata
exp2$sample_id <- as.factor(samples_info$sample_id)
exp2$bat <- as.factor(samples_info$bat)
exp2$organ <- as.factor(samples_info$organ)
exp2$library <- as.factor(samples_info$library)
exp2$organ_bat <- as.factor(samples_info$organ_bat)
counts <- assay(exp2, "counts")
dds <- DESeqDataSetFromMatrix(countData = round(counts),
colData = samples_info,
design = ~ 0 + organ + bat)We filter out low-count genes before downstream analysis by using at least 20 reads in total (including all samples).
keep <- rowSums(counts(dds)) >= 20
dds <- dds[keep,]This high-dimensional count data can - after normalization - be visualized in two dimensions in a Principal Component Analysis (PCA). Using PCA, we examine the structure of the data by projecting the samples on a two-dimensional graph using the two first principal components that explain the most variation between those samples. The plot can be used to examine the samples for outliers, sample swaps and other relationships. When normalization successfully removed technical artefacts, the relative distances should be biologically interpretable.
cols_organ <- c("Skeletal muscle" = "#264D59", "Kidney" = "#77A515", "Intestine" = "#8B4513")
vsd <- vst(dds, blind = F)
plotPCA(vsd, "library")
#assay(vsd) <- limma::removeBatchEffect(assay(vsd), vsd$library)
#plotPCA(vsd, "library")
#topVarGenes_p <- head( order( rowVars( assay(vsd) ), decreasing=TRUE ), 500 )
a <- plotPCA(vsd, intgroup = c('sample_id', 'bat', 'organ' ,'library'), returnData = TRUE)
# By organ
teste <- plotPCA(vsd, "bat", "organ")
#$y
#[1] "PC2: 22% variance"
#$x
#[1] "PC1: 51% variance"
png(paste0(path, '/results/ALL_Libraries/results/01_DE/PCA_bat_organ_L1_2_3.png'), width = 4000, height = 3000, res = 300)
#teste$labels get variance
ggplot(a, aes(x = PC1, y = PC2, color = organ, shape = bat)) +
geom_point(size = 4, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.3) +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.3) +
scale_shape_manual(values=c(15,17)) +
theme_pubr(border = TRUE) +
#coord_fixed(ratio = 1) +
geom_text_repel(aes(label = library), nudge_x = 0.06, size = 5.0, segment.alpha = 0.8) +
theme(axis.text=element_text(size=14),
axis.text.x = element_text(size = 12, hjust = 0.5),
axis.title.y = element_text(size = 18),
legend.text=element_text(size=14),
legend.title=element_text(size=0),
legend.position="bottom",
axis.title.x = element_text(size = 18),
strip.text.x = element_text(size = 20, face = "bold")) +
labs(x="PC1: 51% variance", y = "PC2: 22% variance", element_text(face = "bold")) +
scale_color_manual(values = cols_organ)
dev.off()#The rlog transform
#https://bioc.ism.ac.jp/packages/2.14/bioc/vignettes/DESeq2/inst/doc/beginner.pdf
#highly variable genes. These are the genes most variable across all samples regardless of which samples they are
#rowVars on the output of vst as this corrects for the biased mean/variance trend and puts data on the log scale
rld <- rlog(dds, blind=F )
# Heatmap of Euclidean sample distances after rlog transformation.
sampleDists <- dist( t( assay(rld) ) )
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( rld$sample_id, rld$organ, rld$bat , sep="-" )
png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_rlog_L1_2_3.png'), width = 4000, height = 3000, res = 300)
colours = colorRampPalette(rev(brewer.pal(9, "Blues")) )(255)
d <- pheatmap(sampleDistMatrix, trace="none", col=colours)
d
dev.off()
#Gene clustering
topVarGenes <- head(order(rowVars(assay(rld)), decreasing=TRUE ), 100)
df <- as.data.frame(colData(dds)[,c("bat","organ", "library")])
df$bat = factor(df$bat, levels = c("Bat_1", "Bat_2"), labels = c("Bat_1", "Bat_2"))
metaR_bat <- c("#C0C0C0","#BC8F8F")
names(metaR_bat) <- levels(df$bat)
df$organ = factor(df$organ, levels = c("Skeletal muscle", "Kidney","Intestine"))
metaR_organ <- c("#264D59", "#77A515", "#8B4513")
names(metaR_organ) <- levels(df$organ)
metaR_AnnColour <- list(
bat = metaR_bat)
metaR_AnnColour2 <- list(
organ = metaR_organ)
metaR_AnnColourx <- list(
organ = metaR_organ,
bat = metaR_bat)
# Check the output
metaR_AnnColourx
SampleOrder = order(df$organ, df$bat)
meta.factors <- df[1:2]
df[1:2]
#png(paste0(path, '/results/ALL_Libraries/results/01_DEHeatMap_topvar500_L1.png'), width = 4000, height = 15500, res = 300)
png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_topvar100_L1_2_3.png'), width = 4000, height = 8000, res = 300)
colsHeat<- c("#F7F7F7", "#92C5DE", "#0571B0", "#F4A582", "#CA0020")
pheatmap2 <- pheatmap(assay(rld)[topVarGenes, SampleOrder],
cluster_cols = FALSE,
cluster_rows = TRUE,
gaps_row = 5,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
annotation_colors = metaR_AnnColourx, annotation_col = meta.factors,
show_colnames = TRUE,
#color = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255),
color = colorRampPalette(c(colsHeat))(255),
border_color = "#f8edeb",
display_numbers = FALSE)
pheatmap2
dev.off()
dev.off()
##
topVarGenes1000 <- head(order(rowVars(assay(rld)), decreasing=TRUE ), 1000)
mat <- assay(vsd)[ topVarGenes1000, ]
mat <- mat - rowMeans(mat)
#anno <- as.data.frame(colData(vsd)[, c("bat","organ")])
#pheatmap(mat, annotation_col = anno)
write.table(mat,file='/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/ALL_Libraries/results/01_DE/topVarGenes1000_L1_2_3.tsv',quote=FALSE,sep="\t")
##
png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_topvar100_clus_1_L2_3.png'), width = 5000, height = 8000, res = 300)
#png(paste0(path, '/results/ALL_Libraries/results/01_DEHeatMap_topvar500_clus_L1.png'), width = 5000, height = 15500, res = 300)
pheatmap3 <- pheatmap(assay(rld)[topVarGenes, SampleOrder],
cluster_cols = TRUE,
cluster_rows = TRUE,
gaps_row = 5,
scale="row",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
annotation_colors = metaR_AnnColourx, annotation_col = meta.factors,
show_colnames = TRUE,
#color = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255),
color = colorRampPalette(c(colsHeat))(255),
border_color = "#f8edeb",
display_numbers = FALSE)
pheatmap3
dev.off()counts_norm <- reshape2::melt(counts(dds2, normalized = T), varnames = c('gene_id', 'sample_name'), value.name = 'counts')
counts_norm <- inner_join(x = counts_norm, y = as.data.frame(dds2@colData), by = 'sample_name')
png(paste0(path, '/results/ALL_Libraries/results/01_DE/distribution_L1_2_3.png'), width = 3000, height = 2500, res = 300)
distribution <- ggplot(data = counts_norm, aes(x = reorder(sample_name, bat, na.rm = T), y = log2(counts+1))) + geom_boxplot(aes(fill = factor(counts_norm$organ))) + coord_flip() + xlab('sample_name') + scale_color_manual(values = cols_organ) + theme_pubr(border = TRUE)
distribution
dev.off()We filter out low-count genes before downstream analysis by using at least 20 reads in total (including all samples) by which we removed 5932 genes and kept 23060 genes for downstream analysis.
The distribution of normalized expression in all samples are shown in figure 1.
There is a good correlation among samples as shown in figure 2. You can see that organs formed separate cluster.
Heatmap of Euclidean sample distances after rlog transformation
In figures ?? and ?? each point corresponds to a sample plotted on PC1 and PC2. I can see strong separation by organ . Library batch effect has been removed.
PCA all
TopVar genes results. Table below shows list of 100 genes
TopVar Clustered independent of the sample
Table with the information of the top 1000 genes
Table with all genes can be found at the Github.
Here, you can find the the GO profile of the topVar 1000 genes
Dinamic Link
We can’t go further due to the limitation in the experimental design
Files delivered to the user with descriptions.
rmarkdown::render("report.Rmd")## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] hrbrthemes_0.8.0 viridis_0.6.2
## [3] viridisLite_0.4.1 ggridges_0.5.4
## [5] biomaRt_2.52.0 rnaseqGene_1.20.0
## [7] fission_1.16.0 RUVSeq_1.30.0
## [9] EDASeq_2.30.0 sva_3.44.0
## [11] mgcv_1.8-41 nlme_3.1-160
## [13] Gviz_1.40.1 ReportingTools_2.36.0
## [15] org.Hs.eg.db_3.15.0 genefilter_1.78.0
## [17] ggbeeswarm_0.6.0 glmpca_0.2.0
## [19] PoiClaClu_1.0.2.1 pheatmap_1.0.12
## [21] hexbin_1.28.2 vsn_3.64.0
## [23] apeglm_1.18.0 magrittr_2.0.3
## [25] tximeta_1.14.1 airway_1.16.0
## [27] BiocStyle_2.24.0 gt_0.8.0
## [29] Rqc_1.30.0 ShortRead_1.54.0
## [31] GenomicAlignments_1.32.1 Rsamtools_2.12.0
## [33] BiocParallel_1.30.4 PCAtools_2.8.0
## [35] factoextra_1.0.7.999 FactoMineR_2.6
## [37] AnnotationDbi_1.58.0 ggfortify_0.4.15
## [39] DESeq2_1.36.0 SummarizedExperiment_1.26.1
## [41] Biobase_2.56.0 MatrixGenerics_1.8.1
## [43] matrixStats_0.63.0 GenomicRanges_1.48.0
## [45] forcats_0.5.2 purrr_0.3.5
## [47] readr_2.1.3 tibble_3.1.8
## [49] tidyverse_1.3.2 RColorBrewer_1.1-3
## [51] ggh4x_0.2.3 ggrepel_0.9.2
## [53] reshape_0.8.9 reshape2_1.4.4
## [55] ggpubr_0.5.0 gridExtra_2.3
## [57] gplots_3.1.3 DECIPHER_2.24.0
## [59] RSQLite_2.2.19 Biostrings_2.64.1
## [61] GenomeInfoDb_1.32.4 XVector_0.36.0
## [63] IRanges_2.30.1 S4Vectors_0.34.0
## [65] BiocGenerics_0.42.0 bsselectR_0.1.0
## [67] devtools_2.4.5 usethis_2.1.6
## [69] crosstalk_1.2.0 leaflet_2.1.1
## [71] networkD3_0.4 dygraphs_1.1.1.6
## [73] ggiraph_0.8.4 plotly_4.10.1
## [75] highcharter_0.9.4 ggplot2_3.4.0
## [77] DT_0.26 formattable_0.2.1
## [79] kableExtra_1.3.4 stringr_1.4.1
## [81] tidyr_1.2.1 dplyr_1.0.10
## [83] edgeR_3.38.4 limma_3.52.4
## [85] clusterProfiler_4.4.4 captioner_2.2.3
## [87] bookdown_0.30 knitr_1.41
##
## loaded via a namespace (and not attached):
## [1] graphlayouts_0.8.4 lattice_0.20-45
## [3] haven_2.5.1 vctrs_0.5.1
## [5] Category_2.62.0 RBGL_1.72.0
## [7] blob_1.2.3 survival_3.4-0
## [9] later_1.3.0 R.utils_2.12.2
## [11] DBI_1.1.3 rappdirs_0.3.3
## [13] dqrng_0.3.0 jpeg_0.1-9
## [15] zlibbioc_1.42.0 OrganismDbi_1.38.1
## [17] htmlwidgets_1.5.4 mvtnorm_1.1-3
## [19] leaps_3.1 irlba_2.3.5.1
## [21] markdown_1.4 tidygraph_1.2.2
## [23] Rcpp_1.0.9 KernSmooth_2.23-20
## [25] promises_1.2.0.1 DelayedArray_0.22.0
## [27] pkgload_1.3.2 graph_1.74.0
## [29] Hmisc_4.7-2 fs_1.5.2
## [31] fastmatch_1.1-3 digest_0.6.30
## [33] png_0.1-7 scatterpie_0.1.8
## [35] cowplot_1.1.1 DOSE_3.22.1
## [37] ggraph_2.1.0 pkgconfig_2.0.3
## [39] GO.db_3.15.0 DelayedMatrixStats_1.18.2
## [41] estimability_1.4.1 emdbook_1.3.12
## [43] beeswarm_0.4.0 xfun_0.35
## [45] bslib_0.4.1 zoo_1.8-11
## [47] tidyselect_1.2.0 rtracklayer_1.56.1
## [49] pkgbuild_1.4.0 rlang_1.0.6
## [51] jquerylib_0.1.4 glue_1.6.2
## [53] gdtools_0.2.4 ensembldb_2.20.2
## [55] modelr_0.1.10 emmeans_1.8.2
## [57] multcompView_0.1-8 ggsignif_0.6.4
## [59] GGally_2.1.2 httpuv_1.6.6
## [61] Rttf2pt1_1.3.11 preprocessCore_1.58.0
## [63] TH.data_1.1-1 DO.db_2.9
## [65] annotate_1.74.0 webshot_0.5.4
## [67] jsonlite_1.8.3 bit_4.0.5
## [69] mime_0.12 systemfonts_1.0.4
## [71] stringi_1.7.8 GenomicFiles_1.32.1
## [73] processx_3.8.0 yulab.utils_0.0.5
## [75] bitops_1.0-7 cli_3.4.1
## [77] data.table_1.14.6 timechange_0.1.1
## [79] rstudioapi_0.14 qvalue_2.28.0
## [81] locfit_1.5-9.6 VariantAnnotation_1.42.1
## [83] miniUI_0.1.1.1 gridGraphics_0.5-1
## [85] R.oo_1.25.0 urlchecker_1.0.1
## [87] dbplyr_2.2.1 sessioninfo_1.2.2
## [89] TTR_0.24.3 readxl_1.4.1
## [91] lifecycle_1.0.3 quantmod_0.4.20
## [93] munsell_0.5.0 cellranger_1.1.0
## [95] R.methodsS3_1.8.2 hwriter_1.3.2.1
## [97] caTools_1.18.2 codetools_0.2-18
## [99] coda_0.19-4 vipor_0.4.5
## [101] htmlTable_2.4.1 xtable_1.8-4
## [103] flashClust_1.01-2 googlesheets4_1.0.1
## [105] BiocManager_1.30.19 scatterplot3d_0.3-42
## [107] abind_1.4-5 farver_2.1.1
## [109] AnnotationHub_3.4.0 aplot_0.1.9
## [111] biovizBase_1.44.0 ggtree_3.4.4
## [113] BiocIO_1.6.0 ggbio_1.44.1
## [115] patchwork_1.1.2 profvis_0.3.7
## [117] dichromat_2.0-0.1 cluster_2.1.4
## [119] extrafontdb_1.0 Matrix_1.5-3
## [121] tidytree_0.4.1 ellipsis_0.3.2
## [123] prettyunits_1.1.1 lubridate_1.9.0
## [125] googledrive_2.0.0 reprex_2.0.2
## [127] igraph_1.3.5 fgsea_1.22.0
## [129] remotes_2.4.2 gargle_1.2.1
## [131] htmltools_0.5.3 BiocFileCache_2.4.0
## [133] yaml_2.3.6 GenomicFeatures_1.48.4
## [135] utf8_1.2.2 interactiveDisplayBase_1.34.0
## [137] XML_3.99-0.12 foreign_0.8-83
## [139] withr_2.5.0 GOstats_2.62.0
## [141] aroma.light_3.26.0 bit64_4.0.5
## [143] affyio_1.66.0 multcomp_1.4-20
## [145] ProtGenerics_1.28.0 GOSemSim_2.22.0
## [147] rsvd_1.0.5 ScaledMatrix_1.4.1
## [149] memoise_2.0.1 evaluate_0.18
## [151] extrafont_0.18 geneplotter_1.74.0
## [153] tzdb_0.3.0 callr_3.7.3
## [155] ps_1.7.2 curl_4.3.3
## [157] fansi_1.0.3 highr_0.9
## [159] GSEABase_1.58.0 xts_0.12.2
## [161] checkmate_2.1.0 cachem_1.0.6
## [163] interp_1.1-3 deldir_1.0-6
## [165] rjson_0.2.21 rstatix_0.7.1
## [167] rlist_0.4.6.2 tools_4.2.0
## [169] sass_0.4.4 sandwich_3.0-2
## [171] RCurl_1.98-1.9 car_3.1-1
## [173] ape_5.6-2 ggplotify_0.1.0
## [175] xml2_1.3.3 httr_1.4.4
## [177] assertthat_0.2.1 rmarkdown_2.18
## [179] R6_2.5.1 AnnotationFilter_1.20.0
## [181] nnet_7.3-18 progress_1.2.2
## [183] tximport_1.24.0 KEGGREST_1.36.3
## [185] treeio_1.20.2 gtools_3.9.4
## [187] beachmat_2.12.0 BiocVersion_3.15.2
## [189] BiocSingular_1.12.0 splines_4.2.0
## [191] carData_3.0-5 ggfun_0.0.9
## [193] colorspace_2.0-3 generics_0.1.3
## [195] base64enc_0.1-3 pillar_1.8.1
## [197] Rgraphviz_2.40.0 tweenr_2.0.2
## [199] affy_1.74.0 uuid_1.1-0
## [201] GenomeInfoDbData_1.2.8 plyr_1.8.8
## [203] gtable_0.3.1 bdsmatrix_1.3-6
## [205] rvest_1.0.3 restfulr_0.0.15
## [207] latticeExtra_0.6-30 shadowtext_0.1.2
## [209] fastmap_1.1.0 broom_1.0.1
## [211] BSgenome_1.64.0 scales_1.2.1
## [213] filelock_1.0.2 backports_1.4.1
## [215] enrichplot_1.16.2 hms_1.1.2
## [217] ggforce_0.4.1 shiny_1.7.3
## [219] polyclip_1.10-4 numDeriv_2016.8-1.1
## [221] bbmle_1.0.25 lazyeval_0.2.2
## [223] Formula_1.2-4 crayon_1.5.2
## [225] MASS_7.3-58.1 downloader_0.4
## [227] PFAM.db_3.15.0 sparseMatrixStats_1.8.0
## [229] svglite_2.1.0 AnnotationForge_1.38.1
## [231] rpart_4.1.19 compiler_4.2.0
The responsibility for data archiving lies with the PI of the project. We do not offer long-term storage or retrieval of data.
If you are presenting the results in a paper, at a workshop or conference, we kindly ask you to acknowledge us.
“Support by NBIS (National Bioinformatics Infrastructure Sweden) is gratefully acknowledged.”
“The computations were performed on resources provided by SNIC through Uppsala Multidisciplinary Center for Advanced Computational Science (UPPMAX) under Project SNIC 2022-22-352.”
“The authors would like to acknowledge support from Science for Life Laboratory (SciLifeLab), the National Genomics Infrastructure (NGI), and Uppsala Multidisciplinary Center for Advanced Computational Science (UPPMAX) for providing assistance in massive parallel sequencing and computational infrastructure.”
You should soon be contacted by one of our managers with a request to close down the project in our internal system and for invoicing matters. If we do not hear from you within 30 days the project will be automatically closed and invoice sent. Again, we would like to remind you about data responsibility and acknowledgements, see sections: Data Responsibility and Acknowledgments.
You are welcome to come back to us with further data analysis request at any time via http://nbis.se/support/support.html.
Thank you for using NBIS.